In [2]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyjags
import pickle
## pandas.tools.plotting was renamed pandas.plotting in pandas 0.20; import the two
## helpers we actually use instead of a wildcard:
from pandas.plotting import autocorrelation_plot, scatter_matrix
from matplotlib import rcParams
rcParams["savefig.dpi"] = 100
rcParams["font.size"] = 20
#plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#%qtconsole
Section 1 below provides code to run one-component h and k data through the one-component HBM, and Section 2 provides code to run two-component h and k data through a two-component HBM. After executing and assessing these two sections, make a new notebook (or rearrange this one) to run the one-component data through the two-component HBM, and the two-component data through the one-component HBM (i.e., swap the data and HBM combinations), to answer Question 2 above. Pay attention to the mixture fractions when running one-component data through the two-component HBM.
In [3]:
## Function to draw from a truncated normal via rejection sampling. This function is
## used for both the one- and two-component cases in this notebook.
## Note: np.random.normal takes a standard deviation (not a precision) as its second
## argument, so the parameter is named sigma accordingly.
def rnorm_bound(Ndata, mu, sigma, lower_bound=0.0, upper_bound=float('Inf')):
    x = np.zeros(Ndata)
    for i in range(0, Ndata):
        while True:
            x[i] = np.random.normal(mu, sigma)
            if (x[i] > lower_bound) and (x[i] < upper_bound):
                break
    return x
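Rejection sampling is simple but can become slow when the bounds exclude most of the probability mass. As an optional aside (a sketch assuming SciPy is available; it is not used elsewhere in this notebook), scipy.stats.truncnorm draws from the same distribution in a vectorized way:
## Optional vectorized alternative (sketch; assumes scipy is installed).
## truncnorm expects the bounds in standard-deviation units relative to loc.
from scipy.stats import truncnorm

def rnorm_bound_vec(Ndata, mu, sigma, lower_bound=0.0, upper_bound=float('Inf')):
    a = (lower_bound - mu) / sigma
    b = (upper_bound - mu) / sigma
    return truncnorm.rvs(a, b, loc=mu, scale=sigma, size=Ndata)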
In [4]:
## Check the output to make sure the function behaves as expected.
#print(np.random.normal(0, 0.0001, 10))
print(rnorm_bound(10, 0.0, 0.001, lower_bound=-1, upper_bound=1))
Below we simulate the distributions of both "$\boldsymbol{e \cos \omega}$" and "$\boldsymbol{e \sin \omega}$" ($h$ and $k$, respectively) for hot Jupiter exoplanets from Kepler.
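For reference, $h$ and $k$ are related to the eccentricity $e$ and the argument of periastron $\omega$ by $h = e\cos\omega$ and $k = e\sin\omega$, so $e = \sqrt{h^2 + k^2}$. A physical orbit requires $e < 1$, i.e. $h^2 + k^2 < 1$, which is exactly the constraint the truncation bounds below enforce.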
In [5]:
## One-component Gaussian mixture simulated data for both projected eccentricity terms.
## Below we designate the population values of our generative model. These are the
## truths that we should recover if our hierarchical Bayesian model is properly specified
## and diagnostics have indicated that the simulation has "not not converged": "You can't
## prove convergence; at best you can fail to prove a failure to converge."
## In this simulated data set, there are 25 planetary systems (with one planet each).
Ndata = 25
## Here we assign the dispersion of the simulated population to be 0.3; this is
## the truth we wish to recover.
sigmae = 0.3
## We approximate the uncertainty of each measurement as normally distributed about a
## reported point estimate.
## For hot Jupiters, the physical models used to derive these measurements produce
## uncertainties in k that are larger by a factor of 2.
sigmahobs = 0.04
sigmakobs = 0.08
h = np.repeat(0.,Ndata)
k = np.repeat(0.,Ndata)
hhat = np.repeat(0.,Ndata)
khat = np.repeat(0.,Ndata)
hhat_sigma = np.repeat(sigmahobs,Ndata)
khat_sigma = np.repeat(sigmakobs,Ndata)
#print(khat_sigma)
for i in range(0, Ndata):
    h[i] = rnorm_bound(1, 0, sigmae, lower_bound=-1, upper_bound=1)
    ## Bound k so that e^2 = h^2 + k^2 stays below 1:
    lb = -np.sqrt(1 - h[i]**2)
    ub = np.sqrt(1 - h[i]**2)
    k[i] = rnorm_bound(1, 0, sigmae, lower_bound=lb, upper_bound=ub)
    hhat[i] = rnorm_bound(1, h[i], sigmahobs, lower_bound=-1, upper_bound=1)
    khat[i] = rnorm_bound(1, k[i], sigmakobs, lower_bound=lb, upper_bound=ub)
## Visualize the true data values and the simulated measurements:
print(h, hhat, k, khat)
plt.hist(h)
plt.hist(hhat)
Out[5]:
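Before moving on to the model, a quick sanity check (a small sketch, not part of the original flow) confirms that every simulated system is physical:
## Sanity check (sketch): all simulated eccentricities should satisfy e < 1, and the
## sample scatter of h should be roughly the input sigmae = 0.3.
e = np.sqrt(h**2 + k**2)
assert np.all(e < 1.0)
print('max e =', e.max(), ' std(h) =', h.std())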
Below is the one-component Gaussian mixture model in JAGS, copied from notebook 1 and generalized to also include k in addition to h:
In [6]:
## JAGS user manual:
## http://www.uvm.edu/~bbeckage/Teaching/DataAnalysis/Manuals/manual.jags.pdf
## JAGS model code
## The model code is the one-component truncated Gaussian mixture HBM from the
## previous notebook.
code1 = '''
model {
    # Population parameters
    e_sigma ~ dunif(0.0, 1.0)
    e_phi <- 1/(e_sigma*e_sigma)
    for (n in 1:Ndata){
        # True planet properties
        h[n] ~ dnorm(0, e_phi) T(-1,1)  # Can try multivariate truncated normal in future
        k[n] ~ dnorm(0, e_phi) T(-sqrt(1-h[n]*h[n]), sqrt(1-h[n]*h[n]))
        # Observed planet properties
        hhat[n] ~ dnorm(h[n], 1.0/(hhat_sigma[n]*hhat_sigma[n])) T(-1,1)
        khat[n] ~ dnorm(k[n], 1.0/(khat_sigma[n]*khat_sigma[n])) T(-sqrt(1-hhat[n]*hhat[n]), sqrt(1-hhat[n]*hhat[n]))
    }
}
'''
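Note that JAGS parameterizes dnorm by the precision $\phi = 1/\sigma^2$ rather than by the standard deviation, which is why the model forms e_phi and 1.0/(hhat_sigma[n]*hhat_sigma[n]) above. A quick numpy check of the equivalence (purely illustrative):
## Illustrative only: a normal with precision phi = 1/sigma^2 has sd sigma.
sig = 0.3
phi = 1.0 / sig**2
print(np.random.normal(0.0, 1.0/np.sqrt(phi), size=100000).std())  # ~0.3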
In [7]:
## Load additional JAGS module
pyjags.load_module('glm')
pyjags.load_module('dic')
## See blog post for origination of some of the adapted analysis tools used here and below:
## https://martynplummer.wordpress.com/2016/01/11/pyjags/
num_chains = 4
iterations = 10000
## The data dict should include only variables that appear in the model:
model = pyjags.Model(code1, data=dict( Ndata=Ndata, hhat=hhat, khat=khat,
hhat_sigma=hhat_sigma, khat_sigma=khat_sigma),
chains=num_chains, adapt=1000)
## Code to speed up compute time. This feature might not be
## well tested in pyjags at this time.
## threads=4, chains_per_thread=1
## 500 warmup / burn-in iterations, not used for inference.
model.sample(500, vars=[])
## Run model for desired steps, monitoring hyperparameter variables, and latent variables
## for hierarchical Bayesian model.
## Returns a dictionary with numpy array for each monitored variable.
## Shapes of returned arrays are (... shape of variable ..., iterations, chains).
## samples = model.sample(#iterations per chain here, vars=['e_sigma', 'h', 'k'])
samples = model.sample(iterations, vars=['e_sigma', 'h', 'k'])
## Code to save, open and use pickled dictionary of samples:
## -- Pickle the data --
#with open('ecc_1_test.pkl', 'wb') as handle:
# pickle.dump(samples, handle)
## -- Retrieve pickled data --
#with open('ecc_1_test.pkl', 'rb') as handle:
# retrieved_results = pickle.load(handle)
In [8]:
## Generalize code for both h and k below.
#print(samples)
#print(samples.items())
## Print and check the shape of the resultant samples dictionary:
print(samples['e_sigma'].shape)
print(samples['e_sigma'].squeeze(0).shape)
print(samples['h'].shape)
print(samples['k'][0,:,:].shape)
print('-----')
## Update the samples dictionary so that it includes keys for the latent variables.
## Also, we use LaTeX formatting to help make legible plots ahead.
samples_Nm1 = {}
## Set thin to 10 to look at only every 10th population element; here we keep all 25.
thin = 1
## Need to enter the number of hyperparameter variables here:
numHyperParams = 1
## Specify the dimension we want for our plot below, for legibility.
dim = len(np.arange(0, Ndata, thin)) * 2 + numHyperParams
print(dim)
for i in np.arange(0, Ndata, thin):
    samples_Nm1.update({'$h_{'+str(i+1)+'}$': samples['h'][i,:,:],
                        '$k_{'+str(i+1)+'}$': samples['k'][i,:,:]})
## Add the hyperparameter marginal posterior back in:
samples_Nm1.update({'$e_{\sigma}$': samples['e_sigma'].squeeze(0)})
## Examine the updated samples dictionary, which now includes keys for the
## latent variables:
for j, i in samples_Nm1.items():
    print(j)
    print(i)
samples_Nm1['$h_{5}$'][0]
Out[8]:
Below is code to look at summary statistics of the marginal posterior distributions (the probabilistic parameter estimates) for the hyperparameter and the latent variables (each population constituent), in this case the h and k (the projected eccentricities) of the exoplanet systems we are simulating.
In [9]:
## Equal-tailed 95% credible intervals and posterior distribution means.
## Note: we pool all iterations and chains for each variable, and use the
## [2.5, 97.5] percentiles so the interval really is equal-tailed 95%.
def summary(samples, varname, p=95):
    values = samples[varname]
    ci = np.percentile(values, [50 - p/2., 50 + p/2.])
    print('{:<12} mean = {:>6.3f}, {}% credible interval [{:>6.3f} {:>6.3f}]'.format(
        varname, np.mean(values), p, *ci))

for varname in samples_Nm1:
    summary(samples_Nm1, varname)
In [10]:
## Use the pandas three-dimensional Panel to represent the trace.
## (Note: pd.Panel was removed in pandas 0.25, so this notebook assumes an older pandas.)
trace = pd.Panel({k: v for k, v in samples_Nm1.items()})
trace.axes[0].name = 'Variable'
trace.axes[1].name = 'Iteration'
trace.axes[2].name = 'Chain'
## Point estimates:
print(trace.to_frame().mean())
## Bayesian equal-tailed 95% credible intervals:
print(trace.to_frame().quantile([0.05, 0.95]))
## ^ entering the values here could be a good question part
def plot(trace, var):
    fig, axes = plt.subplots(1, 3, figsize=(9, 3))
    fig.suptitle(var, y=0.95, fontsize='xx-large')
    ## Marginal posterior density estimate:
    trace[var].plot.density(ax=axes[0])
    axes[0].set_xlabel('Parameter value')
    axes[0].locator_params(tight=True)
    ## Autocorrelation for each chain:
    axes[1].set_xlim(0, 100)
    for chain in trace[var].columns:
        autocorrelation_plot(trace[var, :, chain], axes[1], label=chain)
    ## Trace plot:
    axes[2].set_ylabel('Parameter value')
    trace[var].plot(ax=axes[2])
    ## Save figure: strip LaTeX markup from the variable name for the filename.
    filename = var.replace("\\", "")
    filename = filename.replace("$", "")
    filename = filename.replace("}", "")
    filename = filename.replace("{", "")
    plt.tight_layout(pad=3)
    fig.savefig('Nm1_JAGS_h_and_k_' + '{}.png'.format(filename))

## Display diagnostic plots:
for var in trace:
    plot(trace, var)
In [11]:
## Scatter matrix plot:
## Redefine the trace so that we only visualize every 10th latent-variable element in
## the scatter_matrix plot below. Visualizing all 50 is too cumbersome for the scatter
## matrix.
samples_Nm1_for_scatter_matrix = {}
Nsamp = 25
## Set thin to 10 to look at only every 10th population element.
thin = 10
numHyperParams = 1
dim = len(np.arange(0, Nsamp, thin)) * 2 + numHyperParams
print(dim)
for i in np.arange(0, Nsamp, thin):
    samples_Nm1_for_scatter_matrix.update({'$h_{'+str(i+1)+'}$': samples['h'][i,:,:],
                                           '$k_{'+str(i+1)+'}$': samples['k'][i,:,:]})
samples_Nm1_for_scatter_matrix.update({'$e_{\sigma}$': samples['e_sigma'].squeeze(0)})
for j, i in samples_Nm1_for_scatter_matrix.items():
    print(j)
    # print(i)
trace_2 = pd.Panel({k: v for k, v in samples_Nm1_for_scatter_matrix.items()})
sm = scatter_matrix(trace_2.to_frame(), color="darkturquoise", alpha=0.2, figsize=(dim*2, dim*2), diagonal='hist',hist_kwds={'bins':25,'histtype':'step', 'edgecolor':'r','linewidth':2})
## y labels size
[plt.setp(item.yaxis.get_label(), 'size', 20) for item in sm.ravel()]
## x labels size
[plt.setp(item.xaxis.get_label(), 'size', 20) for item in sm.ravel()]
## Change label rotation
## This is helpful for very long labels
#[s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
[s.xaxis.label.set_rotation(0) for s in sm.reshape(-1)]
[s.yaxis.label.set_rotation(0) for s in sm.reshape(-1)]
## May need to offset label when rotating to prevent overlap of figure
[s.get_yaxis().set_label_coords(-0.5,0.5) for s in sm.reshape(-1)]
## Hide all ticks
#[s.set_xticks(()) for s in sm.reshape(-1)]
#[s.set_yticks(()) for s in sm.reshape(-1)]
plt.savefig('scatter_matrix_Nm1_h_and_k_JAGS.png')
In [12]:
## Below we designate the population values of our two-component truncated Gaussian
## generative model. These are the truths that we should recover if our hierarchical
## Bayesian model is properly specified and diagnostics have indicated that the
## simulation has "not not converged".
Ndata = 25
Nm = 2
frac = [0.7, 0.3]
sigmae = [0.05, 0.3]
## After generating values from the population model, we now add realistic Gaussian noise
## to create simulated measurements.
sigmahobs = 0.04
sigmakobs = 0.08
h = np.repeat(0.,Ndata)
k = np.repeat(0.,Ndata)
hhat = np.repeat(0.,Ndata)
khat = np.repeat(0.,Ndata)
hhat_sigma = np.repeat(sigmahobs,Ndata)
khat_sigma = np.repeat(sigmakobs,Ndata)
#print(khat_sigma)
for i in range(0, Ndata):
    ## Draw this system's mixture component with probabilities given by frac:
    c = np.random.choice(len(frac), 1, p=frac, replace=True)
    h[i] = rnorm_bound(1, 0, sigmae[int(c)], lower_bound=-1, upper_bound=1)
    ## Bound k so that e^2 = h^2 + k^2 stays below 1:
    lb = -np.sqrt(1 - h[i]**2)
    ub = np.sqrt(1 - h[i]**2)
    k[i] = rnorm_bound(1, 0, sigmae[int(c)], lower_bound=lb, upper_bound=ub)
    hhat[i] = rnorm_bound(1, h[i], sigmahobs, lower_bound=-1, upper_bound=1)
    khat[i] = rnorm_bound(1, k[i], sigmakobs, lower_bound=lb, upper_bound=ub)
print(h, hhat, k, khat)
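A quick check on the realized mixture fractions can be useful when later interpreting the fitted f. A sketch: the loop above discards the component draws c, so the hypothetical labels array below stands in for recorded labels.
## Sketch: if the loop above also stored labels[i] = int(c), the realized fractions
## could be compared against frac = [0.7, 0.3]. 'labels' here is a stand-in draw.
labels = np.random.choice(len(frac), size=Ndata, p=frac)
print('realized fractions:', np.bincount(labels, minlength=len(frac)) / float(Ndata))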
Below is new JAGS model code for a two-component truncated Gaussian mixture model:
In [13]:
## JAGS model code
code = '''
model {
    # Population parameters
    for (j in 1:Nm) {
        e_sigma[j] ~ dunif(0.0, 1.0)
        e_phi[j] <- 1/(e_sigma[j]*e_sigma[j])
        a[j] <- 1;
    }
    f ~ ddirch(a[])
    for (n in 1:Ndata){
        # True planet properties
        c[n] ~ dcat(f[])
        h[n] ~ dnorm(0, e_phi[c[n]]) T(-1,1)  # Can try multivariate truncated normal in future
        k[n] ~ dnorm(0, e_phi[c[n]]) T(-sqrt(1-h[n]*h[n]), sqrt(1-h[n]*h[n]))
        # Observed planet properties
        hhat[n] ~ dnorm(h[n], 1.0/(hhat_sigma[n]*hhat_sigma[n])) T(-1,1)
        khat[n] ~ dnorm(k[n], 1.0/(khat_sigma[n]*khat_sigma[n])) T(-sqrt(1-hhat[n]*hhat[n]), sqrt(1-hhat[n]*hhat[n]))
    }
}
'''
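The ddirch/dcat pair places a symmetric Dirichlet(1, 1) prior on the mixture weights f and assigns each system a latent component label c[n]. For intuition, the numpy analogue of that generative step looks like this (a sketch, not used in the fit):
## Illustrative numpy analogue of the JAGS prior structure:
f_draw = np.random.dirichlet(np.ones(Nm))             # f ~ ddirch(a), with a = (1, 1)
c_draw = np.random.choice(Nm, size=Ndata, p=f_draw)   # c[n] ~ dcat(f)
print(f_draw, np.bincount(c_draw, minlength=Nm))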
Below, the number of iterations has been increased by a factor of 10. Depending on your laptop's specs, this may slow it down considerably (if it hasn't already crashed). The number of iterations may need to be increased even further to reach signs of having "not not" converged.
In [14]:
# Load additional JAGS module
pyjags.load_module('glm')
pyjags.load_module('dic')
# The data dict should include only variables that appear in the model:
model = pyjags.Model(code, data=dict(Nm=Nm, Ndata=Ndata, hhat=hhat, khat=khat,
hhat_sigma=hhat_sigma, khat_sigma=khat_sigma),
chains=4, adapt=1000)
# threads=4, chains_per_thread=1
# 500 warmup / burn-in iterations, not used for inference.
model.sample(500, vars=[])
## Run model for desired steps, monitoring hyperparameter variables.
## Returns a dictionary with numpy array for each monitored variable.
## Shapes of returned arrays are (... shape of variable ..., iterations, chains).
##
iters = 100000
samples2 = model.sample(iters, vars=['e_sigma', 'h', 'k', 'c', 'f'])
# Pickle the data
#with open('ecc_1_test.pkl', 'wb') as handle:
# pickle.dump(samples, handle)
# Retrieve pickled data
# with open('ecc_1_test.pkl', 'rb') as handle:
# retrieved_results = pickle.load(handle)
Sort the high and low mixture components to accommodate the exchangeability of the mixture parameters (label switching). (Future work: reprocess the sorted variables to obtain Gelman-Rubin statistics.)
In [15]:
## Keep only the last 1000 iterations of each chain, and relabel the two components
## so that "low" always refers to the smaller e_sigma draw:
start = int(iters - 1000)
esigma_low = np.where(samples2['e_sigma'][0,start::,:] <= samples2['e_sigma'][1,start::,:],
                      samples2['e_sigma'][0,start::,:], samples2['e_sigma'][1,start::,:])
esigma_hi = np.where(samples2['e_sigma'][0,start::,:] > samples2['e_sigma'][1,start::,:],
                     samples2['e_sigma'][0,start::,:], samples2['e_sigma'][1,start::,:])
f_low = np.where(samples2['e_sigma'][0,start::,:] <= samples2['e_sigma'][1,start::,:],
                 samples2['f'][0,start::,:], samples2['f'][1,start::,:])
f_hi = np.where(samples2['e_sigma'][0,start::,:] > samples2['e_sigma'][1,start::,:],
                samples2['f'][0,start::,:], samples2['f'][1,start::,:])
print(np.min(f_hi))
plt.hist(f_low)
Out[15]:
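The "future work" note above can be sketched directly: a minimal Gelman-Rubin $\hat{R}$ computed on the sorted draws, assuming an (iterations, chains) array such as esigma_low. Values near 1 (e.g. below about 1.1) are consistent with having "not not converged":
## Minimal Gelman-Rubin R-hat sketch for an (iterations, chains) array:
def gelman_rubin(x):
    n = x.shape[0]                      # iterations per chain
    W = x.var(axis=0, ddof=1).mean()    # mean within-chain variance
    B = n * x.mean(axis=0).var(ddof=1)  # between-chain variance
    var_hat = (n - 1) / n * W + B / n   # pooled posterior variance estimate
    return np.sqrt(var_hat / W)

print(gelman_rubin(esigma_low), gelman_rubin(esigma_hi))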
In [18]:
iters = 100000
#print(samples)
#print(samples.items())
print(samples2['h'].shape)
print(samples2['k'][0,start::,:].shape)
print('-----')
samples_Nm2 = {}
Nsamp = 25
thin = 1
## Four hyperparameters now: e_sigma (low/high) and f (low/high).
numHyperParams = 4
dim = len(np.arange(0, Nsamp, thin)) * 2 + numHyperParams
print(dim)
for i in np.arange(0, Nsamp, thin):
    samples_Nm2.update({'$h_{'+str(i+1)+'}$': samples2['h'][i,start::,:],
                        '$k_{'+str(i+1)+'}$': samples2['k'][i,start::,:]})
samples_Nm2.update({'$e_{\sigma_{low}}$': esigma_low, '$e_{\sigma_{high}}$': esigma_hi})
samples_Nm2.update({'$f_{low}$': f_low, '$f_{high}$': f_hi})
for j, i in samples_Nm2.items():
    print(j)
    #print(i)
In [19]:
## Equal-tailed 95% credible intervals and posterior distribution means
## (same summary() as above, repeated so Section 2 can run standalone):
def summary(samples, varname, p=95):
    values = samples[varname]
    ci = np.percentile(values, [50 - p/2., 50 + p/2.])
    print('{:<12} mean = {:>6.3f}, {}% credible interval [{:>6.3f} {:>6.3f}]'.format(
        varname, np.mean(values), p, *ci))

for varname in samples_Nm2:
    summary(samples_Nm2, varname)
In [20]:
## Use pandas three dimensional Panel to represent the trace:
trace = pd.Panel({k: v for k, v in samples_Nm2.items()})
trace.axes[0].name = 'Variable'
trace.axes[1].name = 'Iteration'
trace.axes[2].name = 'Chain'
## Point estimates:
print(trace.to_frame().mean())
## Bayesian equal-tailed 95% credible intervals:
print(trace.to_frame().quantile([0.05, 0.95]))
## ^ entering the values here could be a good question part
def plot(trace, var):
    fig, axes = plt.subplots(1, 3, figsize=(9, 3))
    fig.suptitle(var, y=0.95, fontsize='xx-large')
    ## Marginal posterior density estimate:
    trace[var].plot.density(ax=axes[0])
    axes[0].set_xlabel('Parameter value')
    axes[0].locator_params(tight=True)
    ## Autocorrelation for each chain:
    axes[1].set_xlim(0, 100)
    for chain in trace[var].columns:
        autocorrelation_plot(trace[var, :, chain], axes[1], label=chain)
    ## Trace plot:
    axes[2].set_ylabel('Parameter value')
    trace[var].plot(ax=axes[2])
    ## Save figure: strip LaTeX markup from the variable name for the filename.
    filename = var.replace("\\", "")
    filename = filename.replace("/", "")
    filename = filename.replace("$", "")
    filename = filename.replace("}", "")
    filename = filename.replace("{", "")
    plt.tight_layout(pad=3)
    fig.savefig('Nm2_h_and_k_JAGS_' + '{}.png'.format(filename))

## Display diagnostic plots:
for var in trace:
    plot(trace, var)
In [21]:
## Scatter matrix plot:
## Redefine the trace so that we only visualize every 10th latent-variable element in
## the scatter_matrix plot below. Visualizing all 50 is too cumbersome for the scatter
## matrix.
samples_Nm2_for_scatter_matrix = {}
## Set thin to 10 to look at only every 10th population element.
thin = 10
numHyperParams = 4
dim = len(np.arange(0, Ndata, thin)) * 2 + numHyperParams
print(dim)
for i in np.arange(0, Ndata, thin):
    samples_Nm2_for_scatter_matrix.update({'$h_{'+str(i+1)+'}$': samples2['h'][i,start::,:],
                                           '$k_{'+str(i+1)+'}$': samples2['k'][i,start::,:]})
samples_Nm2_for_scatter_matrix.update({'$e_{\sigma_{low}}$': esigma_low, '$e_{\sigma_{high}}$': esigma_hi})
samples_Nm2_for_scatter_matrix.update({'$f_{low}$': f_low, '$f_{high}$': f_hi})
for j, i in samples_Nm2_for_scatter_matrix.items():
    print(j)
    # print(i)
trace_3 = pd.Panel({k: v for k, v in samples_Nm2_for_scatter_matrix.items()})
sm = scatter_matrix(trace_3.to_frame(), color="darkturquoise", alpha=0.2, figsize=(dim*2, dim*2), diagonal='hist',hist_kwds={'bins':25,'histtype':'step', 'edgecolor':'r','linewidth':2})
## y labels size
[plt.setp(item.yaxis.get_label(), 'size', 20) for item in sm.ravel()]
## x labels size
[plt.setp(item.xaxis.get_label(), 'size', 20) for item in sm.ravel()]
## Change label rotation
## This is helpful for very long labels
#[s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
[s.xaxis.label.set_rotation(0) for s in sm.reshape(-1)]
[s.yaxis.label.set_rotation(0) for s in sm.reshape(-1)]
## May need to offset label when rotating to prevent overlap of figure
[s.get_yaxis().set_label_coords(-0.5,0.5) for s in sm.reshape(-1)]
## Hide all ticks
#[s.set_xticks(()) for s in sm.reshape(-1)]
#[s.set_yticks(()) for s in sm.reshape(-1)]
plt.savefig('scatter_matrix_Nm2_h_and_k_JAGS.png')